import openai
import json
from tqdm import tqdm
import time
import argparse
import os
from azfuse import Fuse

# Load the API key (first line) and the API base URL (second line) from the
# credentials file and configure the openai client with them.
#
# The file is read exactly once: a second f.readlines() on the same handle
# starts at end-of-file and returns an empty list, so indexing it would raise
# IndexError.
with open('credentials/openai_key.txt') as f:
    _credential_lines = f.readlines()

if len(_credential_lines) < 2:
    raise ValueError(
        "credentials/openai_key.txt must contain the API key on the first "
        "line and the API base URL on the second line"
    )

key = _credential_lines[0].strip()
api_base = _credential_lines[1].strip()

openai.api_key = key
openai.api_base = api_base
# NOTE(review): api_version plus the `engine=` argument used further down look
# like an Azure OpenAI setup, but openai.api_type is never set here -- confirm
# it is configured elsewhere (e.g. via the OPENAI_API_TYPE environment variable).
# openai.api_version = '2022-12-01' # this may change in the future
openai.api_version = "2023-07-01-preview" # this may change in the future
# deployment_id='gpt-4-32k-0314' #This will correspond to the custom name you chose for your deployment when you deployed a model.


# System prompt selected by q_type="know" in gen_na_w_caption.
# Asks the model for one unanswerable and one answerable knowledge-based
# question about a caption, with two in-context examples, and fixes the
# "- Q1 / - A1 / - Q2 / - A2" reply format. The "\n\n" sequences inside the
# example captions are real escapes and become newlines in the prompt.
IN_CONTEXT_Q_GEN_KNOWLEDGE_PROMPT = """
You are given a descriptive caption of an image. Generate a knowledge based answerable and an unanswerable question from the caption.
An unanswerable question requires external knowledge or commonsense that is not explicitly present in the image to answer the question.
An answerable question requires commonsense knowledge not present in the image pixels but can be answered from the context.

Make the unanswerable and answerable questions as similar to each other as possible yet one is answerable and the other is unanswerable. 
Here are some examples:

Caption: In the center of the image, a vibrant blue lunch tray holds four containers, each brimming with a variety of food items. The containers, two in pink and two in yellow, are arranged in a 2x2 grid.\n\nIn the top left pink container, a slice of bread rests, lightly spread with butter and sprinkled with a handful of almonds. The bread is cut into a rectangle, and the almonds are scattered across its buttery surface.\n\nAdjacent to it in the top right corner, another pink container houses a mix of fruit. Sliced apples with their fresh white interiors exposed share the space with juicy chunks of pineapple. The colors of the apple slices and pineapple chunks contrast beautifully against the pink container.\n\nBelow these, in the bottom left corner of the tray, a yellow container holds a single meatball alongside some broccoli. The meatball, round and browned, sits next to the vibrant green broccoli florets.\n\nFinally, in the bottom right yellow container, there's a sweet treat - a chocolate chip cookie. The golden-brown cookie is dotted with chocolate chips, their dark color standing out against the cookie's lighter surface.\n\nThe arrangement of these containers on the blue tray creates a visually appealing and balanced meal, with each component neatly separated yet part of a cohesive whole.
Unanswerable Question: How many calories in this meal?
Answer: Unanswerable
Answerable Question: Which cuisine is the meal?
Answer: English meal 

Caption: This image captures a fascinating scene in a dense jungle. Two majestic, gray elephants are the main subjects of the photo. They are carrying people on their backs, who are seated in wooden seats and wearing helmets for safety. The elephants are walking in a line, one following the other, on a path that cuts through the lush greenery of the jungle. The photo is taken from a higher vantage point, providing a bird's eye view of the elephants and their verdant surroundings. The dense foliage and towering trees of the jungle envelop the path, creating a sense of adventure and exploration.
Unanswerable Question: What are the relationships between the people on the elephants?
Answer: Unanswerable
Answerable Question: Who are the people on the back of the elephants?
Answer: Most likely tourists

Keep in mind that you should make your question more natural, meaning that the question is plausible to be asked by a human. 

Please generate an unanswerable question and an answerable question for the given caption, in the following format:
- Q1: <Unanswerable question>
- A1: answer to Q1
- Q2: <Answerable question>
- A2: answer to Q2


DO NOT ask about anything that is difficult to observe or learn even with external knowledge, such as the exact time, exact location, the exact thought of someone, or the conversation or the topic of conversation between people. If you can only come up with such question, put "Not a good question" for A1.
"""


# System prompt selected by q_type="pred" in gen_na_w_caption.
# Asks the model for one unanswerable and one answerable temporal/prediction
# question about a caption, with two in-context examples, and fixes the
# "- Q1 / - A1 / - Q2 / - A2" reply format. The "\n\n" sequences inside the
# example captions are real escapes and become newlines in the prompt.
IN_CONTEXT_Q_GEN_PRED_PROMPT = """
You are given a caption of an image. Generate a question that requires to make predictions of future events from the time the image is captured requiring some temporal event reasoning that is not directly observable from the image. 
An unanswerable question requires temporal reasoning that cannot be inferred from the caption to answer the question.
An answerable question requires temporal commonsense and can be answered from the caption.

Make the unanswerable and answerable questions as similar to each other as possible yet one is answerable and the other is unanswerable. 
Do NOT ask about anything that is difficult to infer even if you observe the future events, such as the exact time, exact location, or the exact thought of someone.

Here are some examples:

Caption: The image showcases a captivating scene of a dressage routine being performed by two horses and their riders in a grassy field. The horse on the left is a majestic white stallion, while the one on the right is a striking black stallion. Both horses are displaying their strength and agility by rearing up on their hind legs, creating an impressive spectacle.\n\nThe riders, dressed in crisp white outfits and blue hats, appear to be in perfect sync with their horses. Their attire contrasts beautifully with the vibrant green of the field, adding to the overall aesthetic of the image.\n\nIn the background, colorful flags and obstacles can be seen, indicating that this might be a competitive event. The lush trees and shrubs further enhance the natural beauty of the setting.\n\nOverall, this image captures a moment of harmony between the riders and their horses, set against a backdrop of nature's splendor. It's a testament to the skill and grace involved in dressage.
Unanswerable Question: Are the two people riding the horses going to fall?
Answer: Unanswerable
Answerable Question: Has the race started?
Answer: Yes

Caption: The image features two main objects placed on a white shelf against a white wall. \n\nOn the left, there is a charming **owl candle holder**. It is white in color, matching the overall aesthetic of the setting. The owl's intricate design is captivating, with its wide eyes and detailed feathers. A candle is lit within the holder, casting a warm and inviting glow.\n\nTo the right of the owl, there is an **antique-style clock**. The clock is predominantly white but is adorned with gold accents that add a touch of elegance. It has a round face displaying time with Roman numerals, further enhancing its antique appeal.\n\nThe objects are arranged neatly on the shelf, creating a harmonious and pleasing visual composition. The use of white for both the objects and the background gives the image a clean, minimalist aesthetic.
Unanswerable Question: Is the clock still working?
Answer: Unanswerable
Answerable Question: Is someone lighting the candle?
Answer: No, it is already lit.

Keep in mind that you should make your question more natural, meaning that the question is plausible to be asked by a human. 

IMPORTANT: The question and answer should only have temporal aspect.
IMPORTANT: The answerable question should have a clear and correct answer without any ambiguity.
Please generate an unanswerable question and an answerable question for the given caption, in the following format:
- Q1: <Unanswerable question>
- A1: answer to Q1
- Q2: <Answerable question>
- A2: answer to Q2

"""

# Keep in mind that you should make your question more natural, meaning that the question is plausible to be asked by a human. For example, the following question is not natural:
# - What is the precise location of ...?
# - What is the exact time when ...?
# - What is SOMEONE thinking about ...?
# - What was the exact thought going through SOMEONE ...?
# - How many seconds/minutes/hours will it take for ...?
# - How long will it take for ...?


# System prompt selected by q_type="complex" in gen_na_w_caption.
# Asks the model for one question that is too complex/tedious to answer and
# one convolutedly phrased but answerable question about a caption, with two
# in-context examples, and fixes the "- Q1 / - A1 / - Q2 / - A2" reply format.
# The "\n\n" sequences inside the example captions are real escapes and become
# newlines in the prompt.
IN_CONTEXT_Q_GEN_COMPLEX_PROMPT = """
You are given a caption of an image. Generate unanswerable questions that ask about an existing object in the image, but are too complex even for humans to answer. 
The unanswerable question should be extremely difficult in framing or tedious to infer the answer.
The answerable question should have a convoluted framing but should have an accurate and direct answer.

Here are some examples:

Caption: This image captures a serene moment in a zoo enclosure, where two majestic giraffes are seen in their natural behavior. The giraffes, adorned in their distinctive brown and white patterns, stand tall against the backdrop of lush green trees.\n\nOn the left, one giraffe is actively engaged in a meal, its long neck extended towards the tree as it munches on the verdant leaves. Its companion on the right stands leisurely next to a tree trunk, perhaps taking a break from its own leafy feast.\n\nThe enclosure they inhabit is grassy and spacious, providing them with ample room to roam and forage. The trees dotting the enclosure not only offer a source of food but also create a naturalistic habitat for these towering creatures.\n\nIn summary, this image is a snapshot of life in a zoo, showcasing the grace and beauty of giraffes in an environment designed to mimic their wild habitats.
Unanswerable Question: How many tree leaves are seen in the image? 
Answer: Unanswerable
Answerable Question: How many animal legs are present?
Answer: 8 legs of 2 giraffes

Caption: This image captures a fascinating scene in a dense jungle. Two majestic, gray elephants are the main subjects of the photo. They are carrying people on their backs, who are seated in wooden seats and wearing helmets for safety. The elephants are walking in a line, one following the other, on a path that cuts through the lush greenery of the jungle. The photo is taken from a higher vantage point, providing a bird's eye view of the elephants and their verdant surroundings. The dense foliage and towering trees of the jungle envelop the path, creating a sense of adventure and exploration.
Unanswerable complex question: What are the interactions of the individuals on the elephants' backs with the environment?
Answer: Unanswerable
Answerable complex question: A couple of living beings are carrying another couple of living beings. What are the latter living beings?
Answer: Humans

IMPORTANT: COMPLEXITY OF THE QUESTION SHOULD BE ONLY AND ONLY BASED ON DIFFICULTY TO ANSWER OR FRAMING OF THE QUESTION.
THEY SHOULD NOT REQUIRE ADDITIONAL INFORMATION.

Please generate an unanswerable question and an answerable question for the given caption, in the following format:
- Q1: <unanswerable question>
- A1: answer to Q1
- Q2: <answerable question>
- A2: answer to Q2
"""

# System prompt selected by q_type="ambiguity" in gen_na_w_caption.
# Asks the model for one ambiguous (hence unanswerable) question and one
# answerable question about a caption, with two in-context examples, and fixes
# the "- Q1 / - A1 / - Q2 / - A2" reply format. Every example question refers
# to the caption directly above it. The "\n\n" sequences inside the example
# captions are real escapes and become newlines in the prompt.
IN_CONTEXT_Q_GEN_AMBIGUITY_PROMPT = """
You are given a caption of an image. Generate unanswerable questions that ask about an existing object in the caption, but are ambiguous. 
DEFINITION: Ambiguity refers to a situation or statement that can be understood or interpreted in multiple ways. It often involves uncertainty or lack of clarity, leading to confusion or different possible meanings.

The unanswerable question should be ambiguous because of indistinguishability of objects or people mentioned in the question. As a result without clarification, multiple answers are possible.
The answerable question should have a convoluted framing but should have an accurate and direct answer.

Here are some examples:

Caption: This image captures a serene moment in a zoo enclosure, where two majestic giraffes are seen in their natural behavior. The giraffes, adorned in their distinctive brown and white patterns, stand tall against the backdrop of lush green trees.\n\nOn the left, one giraffe is actively engaged in a meal, its long neck extended towards the tree as it munches on the verdant leaves. Its companion on the right stands leisurely next to a tree trunk, perhaps taking a break from its own leafy feast.\n\nThe enclosure they inhabit is grassy and spacious, providing them with ample room to roam and forage. The trees dotting the enclosure not only offer a source of food but also create a naturalistic habitat for these towering creatures.\n\nIn summary, this image is a snapshot of life in a zoo, showcasing the grace and beauty of giraffes in an environment designed to mimic their wild habitats.
Unanswerable Question: What is the giraffe doing?
Answer: There are multiple giraffes. Unanswerable
Answerable Question: What is the giraffe on the left doing?
Answer: It is eating leaves from a tree.

Caption: This image captures a fascinating scene in a dense jungle. Two majestic, gray elephants are the main subjects of the photo. They are carrying people on their backs, who are seated in wooden seats and wearing helmets for safety. The elephants are walking in a line, one following the other, on a path that cuts through the lush greenery of the jungle. The photo is taken from a higher vantage point, providing a bird's eye view of the elephants and their verdant surroundings. The dense foliage and towering trees of the jungle envelop the path, creating a sense of adventure and exploration.
Unanswerable ambiguous question: Is the bird's eye view from the top of a tree or from a nearby mountain or a drone?
Answer: All options are possible. Unanswerable
Answerable unambiguous question: What are the people on the elephants' backs wearing?
Answer: Helmets

IMPORTANT: AMBIGUITY OF THE QUESTION SHOULD BE ONLY AND ONLY BASED ON THE POSSIBILITY OF MULTIPLE ANSWERS.
THEY SHOULD NOT REQUIRE ADDITIONAL INFORMATION.

Please generate an unanswerable question and an answerable question for the given caption, in the following format:
- Q1: <unanswerable question>
- A1: answer to Q1
- Q2: <answerable question>
- A2: answer to Q2
"""


# System prompt selected by q_type="overall" (the default) in gen_na_w_caption.
# Combines the four categories (external knowledge, temporal, complex,
# ambiguity), each with two in-context examples, and asks the model to reply
# with question / rationale / category triples ("- Q1 / - R1 / - C1 / - Q2 /
# - R2 / - C2"). Every example question refers to the caption directly above
# it. The "\n\n" sequences inside the example captions are real escapes and
# become newlines in the prompt.
OVERALL_Q_GEN_PROMPT = """
You are given a caption of an image. Imagine that you can see the image, generate a few questions for each category listed below that are not directly answerable and are answerable given the image context. Consider the following types of question:

- Category 1 - External knowledge: Generate a knowledge based answerable and an unanswerable question from the caption. An unanswerable question requires external knowledge or commonsense that is not explicitly present in the image to answer the question. An answerable question requires commonsense knowledge not present in the image pixels but can be answered from the context.
Here are 2 examples:
Caption: In the center of the image, a vibrant blue lunch tray holds four containers, each brimming with a variety of food items. The containers, two in pink and two in yellow, are arranged in a 2x2 grid.\n\nIn the top left pink container, a slice of bread rests, lightly spread with butter and sprinkled with a handful of almonds. The bread is cut into a rectangle, and the almonds are scattered across its buttery surface.\n\nAdjacent to it in the top right corner, another pink container houses a mix of fruit. Sliced apples with their fresh white interiors exposed share the space with juicy chunks of pineapple. The colors of the apple slices and pineapple chunks contrast beautifully against the pink container.\n\nBelow these, in the bottom left corner of the tray, a yellow container holds a single meatball alongside some broccoli. The meatball, round and browned, sits next to the vibrant green broccoli florets.\n\nFinally, in the bottom right yellow container, there's a sweet treat - a chocolate chip cookie. The golden-brown cookie is dotted with chocolate chips, their dark color standing out against the cookie's lighter surface.\n\nThe arrangement of these containers on the blue tray creates a visually appealing and balanced meal, with each component neatly separated yet part of a cohesive whole.
Unanswerable Question: How many calories in this meal?
Answer: Unanswerable
Answerable Question: Which cuisine is the meal?
Answer: English meal 

Caption: This image captures a fascinating scene in a dense jungle. Two majestic, gray elephants are the main subjects of the photo. They are carrying people on their backs, who are seated in wooden seats and wearing helmets for safety. The elephants are walking in a line, one following the other, on a path that cuts through the lush greenery of the jungle. The photo is taken from a higher vantage point, providing a bird's eye view of the elephants and their verdant surroundings. The dense foliage and towering trees of the jungle envelop the path, creating a sense of adventure and exploration.
Unanswerable Question: What are the relationships between the people on the elephants?
Answer: Unanswerable
Answerable Question: Who are the people on the back of the elephants?
Answer: Most likely tourists

- Category 2 - Temporal: Generate a question that requires to make predictions of future events from the time the image is captured requiring some temporal event reasoning that is not directly observable from the image. An unanswerable question requires temporal reasoning that cannot be inferred from the caption to answer the question. An answerable question requires temporal commonsense and can be answered from the caption.
Here are 2 examples:
Caption: The image showcases a captivating scene of a dressage routine being performed by two horses and their riders in a grassy field. The horse on the left is a majestic white stallion, while the one on the right is a striking black stallion. Both horses are displaying their strength and agility by rearing up on their hind legs, creating an impressive spectacle.\n\nThe riders, dressed in crisp white outfits and blue hats, appear to be in perfect sync with their horses. Their attire contrasts beautifully with the vibrant green of the field, adding to the overall aesthetic of the image.\n\nIn the background, colorful flags and obstacles can be seen, indicating that this might be a competitive event. The lush trees and shrubs further enhance the natural beauty of the setting.\n\nOverall, this image captures a moment of harmony between the riders and their horses, set against a backdrop of nature's splendor. It's a testament to the skill and grace involved in dressage.
Unanswerable Question: Are the two people riding the horses going to fall?
Answer: Unanswerable
Answerable Question: Has the race started?
Answer: Yes

Caption: The image features two main objects placed on a white shelf against a white wall. \n\nOn the left, there is a charming **owl candle holder**. It is white in color, matching the overall aesthetic of the setting. The owl's intricate design is captivating, with its wide eyes and detailed feathers. A candle is lit within the holder, casting a warm and inviting glow.\n\nTo the right of the owl, there is an **antique-style clock**. The clock is predominantly white but is adorned with gold accents that add a touch of elegance. It has a round face displaying time with Roman numerals, further enhancing its antique appeal.\n\nThe objects are arranged neatly on the shelf, creating a harmonious and pleasing visual composition. The use of white for both the objects and the background gives the image a clean, minimalist aesthetic.
Unanswerable Question: Is the clock still working?
Answer: Unanswerable
Answerable Question: Is someone lighting the candle?
Answer: No, it is already lit.

- Category 3 - Complex: Generate unanswerable questions that ask about an existing object in the image, but are too complex even for humans to answer.  The unanswerable question should be extremely difficult in framing or tedious to infer the answer. The answerable question should have a convoluted framing but should have an accurate and direct answer.
Here are 2 examples:

Caption: This image captures a serene moment in a zoo enclosure, where two majestic giraffes are seen in their natural behavior. The giraffes, adorned in their distinctive brown and white patterns, stand tall against the backdrop of lush green trees.\n\nOn the left, one giraffe is actively engaged in a meal, its long neck extended towards the tree as it munches on the verdant leaves. Its companion on the right stands leisurely next to a tree trunk, perhaps taking a break from its own leafy feast.\n\nThe enclosure they inhabit is grassy and spacious, providing them with ample room to roam and forage. The trees dotting the enclosure not only offer a source of food but also create a naturalistic habitat for these towering creatures.\n\nIn summary, this image is a snapshot of life in a zoo, showcasing the grace and beauty of giraffes in an environment designed to mimic their wild habitats.
Unanswerable Question: How many tree leaves are seen in the image? 
Answer: Unanswerable
Answerable Question: How many animal legs are present?
Answer: 8 legs of 2 giraffes

Caption: This image captures a fascinating scene in a dense jungle. Two majestic, gray elephants are the main subjects of the photo. They are carrying people on their backs, who are seated in wooden seats and wearing helmets for safety. The elephants are walking in a line, one following the other, on a path that cuts through the lush greenery of the jungle. The photo is taken from a higher vantage point, providing a bird's eye view of the elephants and their verdant surroundings. The dense foliage and towering trees of the jungle envelop the path, creating a sense of adventure and exploration.
Unanswerable question: What are the interactions of the individuals on the elephants' backs with the environment?
Answer: Unanswerable
Answerable question: A couple of living beings are carrying another couple of living beings. What are the latter living beings?
Answer: Humans

- Category 4 - Ambiguity: It refers to a situation or statement that can be understood or interpreted in multiple ways. It often involves uncertainty or lack of clarity, leading to confusion or different possible meanings. The unanswerable question should be ambiguous and it should not have a clear answer, rather multiple answers should be possible. The answerable question should have a convoluted framing but should have an accurate and direct answer.

Here are some examples:
Caption: This image captures a serene moment in a zoo enclosure, where two majestic giraffes are seen in their natural behavior. The giraffes, adorned in their distinctive brown and white patterns, stand tall against the backdrop of lush green trees.\n\nOn the left, one giraffe is actively engaged in a meal, its long neck extended towards the tree as it munches on the verdant leaves. Its companion on the right stands leisurely next to a tree trunk, perhaps taking a break from its own leafy feast.\n\nThe enclosure they inhabit is grassy and spacious, providing them with ample room to roam and forage. The trees dotting the enclosure not only offer a source of food but also create a naturalistic habitat for these towering creatures.\n\nIn summary, this image is a snapshot of life in a zoo, showcasing the grace and beauty of giraffes in an environment designed to mimic their wild habitats.
Unanswerable Question: What is the giraffe doing?
Answer: There are multiple giraffes. Unanswerable
Answerable Question: What is the giraffe on the left doing?
Answer: It is eating leaves from a tree.

Caption: This image captures a fascinating scene in a dense jungle. Two majestic, gray elephants are the main subjects of the photo. They are carrying people on their backs, who are seated in wooden seats and wearing helmets for safety. The elephants are walking in a line, one following the other, on a path that cuts through the lush greenery of the jungle. The photo is taken from a higher vantage point, providing a bird's eye view of the elephants and their verdant surroundings. The dense foliage and towering trees of the jungle envelop the path, creating a sense of adventure and exploration.
Unanswerable question: Is the bird's eye view from the top of a tree or from a nearby mountain or a drone?
Answer: All options are possible. Unanswerable
Answerable question: What are the people on the elephants' backs wearing?
Answer: Helmets

Following the format below to generate questions, provide a rationale why each question is answerable or unanswerable along with the category. Keep in mind that although you are not directly seeing the image, anything that may be observed from the image is considered as the context, even if not mentioned in caption.
- Q1: <unanswerable question>
- R1: rationale why Q1 is unanswerable
- C1: category of Q1
- Q2: <answerable question>
- R2: rationale why Q2 is answerable
- C2: category of Q2
"""

# Keep in mind that you should make your question more natural, meaning that the question is plausible to be asked by a human. For example, the following question is not natural:
# - What is the precise count of ...?
# - What is the exact number of ...?
# - What is the exact time when ...?
# - How many seconds/minutes/hours will it take for ...?
# - How long will it take for ...?

def gen_na_w_caption(annotation_file="<DATA_FOLDER>/docci/docci_descriptions.jsonlines",
      deployment_id="gpt4",
      max_num_retries=5,
      output_folder="<DATA_FOLDER>/docci/",
      q_type="overall",
      debug=False,
      overwrite=False):
    """Generate question pairs for every caption in an annotation file.

    For each record, the caption is sent to the chat-completion deployment
    together with the system prompt selected by ``q_type``; the raw reply is
    written to one text file per record under
    ``<output_folder>/gpt4_gen_<q_type>_q/``.

    Two record layouts are handled:

    * records with an ``"image"`` key: the caption is the ``"value"`` of the
      last entry of ``"conversations"``; the output name is the image path
      with ``/`` replaced by ``_`` and ``.jpg`` replaced by ``.txt``;
    * all other records: the caption is ``"description"`` and the output name
      is ``<example_id>.txt``.

    Args:
        annotation_file: Path to the annotations. A path ending in ``.json``
            is loaded as a single JSON document through azfuse; anything else
            is read as JSON-lines (one JSON object per line).
        deployment_id: Passed as ``engine`` to ``openai.ChatCompletion.create``.
        max_num_retries: Maximum number of API attempts per record before the
            record is given up on.
        output_folder: Parent folder in which the per-``q_type`` output folder
            is created.
        q_type: One of ``"pred"``, ``"know"``, ``"complex"``, ``"ambiguity"``
            or ``"overall"``.
        debug: If true, only the first 10 records are processed.
        overwrite: If false, records whose output file already exists are
            skipped.

    Raises:
        KeyError: If ``q_type`` is not one of the supported values.
    """
    prompt = {
        "pred": IN_CONTEXT_Q_GEN_PRED_PROMPT,
        "know": IN_CONTEXT_Q_GEN_KNOWLEDGE_PROMPT,
        "complex": IN_CONTEXT_Q_GEN_COMPLEX_PROMPT,
        "ambiguity": IN_CONTEXT_Q_GEN_AMBIGUITY_PROMPT,
        "overall": OVERALL_Q_GEN_PROMPT
    }
    # Validate up front so that a bad q_type fails before any file is read or
    # any folder is created, instead of inside the per-record loop.
    if q_type not in prompt:
        raise KeyError(
            f"Unknown q_type {q_type!r}; expected one of {sorted(prompt)}"
        )
    system_prompt = prompt[q_type]

    if annotation_file.endswith(".json"):
        # The module-level import only brings in `Fuse`, so `File` has to be
        # imported here; without it this branch raises NameError.
        from azfuse import File
        with File.open(annotation_file) as f:
            data = json.load(f)
    else:
        with open(annotation_file, encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            data = [json.loads(line) for line in f if line.strip()]

    output_folder = os.path.join(output_folder, f"gpt4_gen_{q_type}_q")
    if debug:
        data = data[:10]
    os.makedirs(output_folder, exist_ok=True)

    for d in tqdm(data, total=len(data)):
        if "image" in d:
            image_id = d["image"]
            caption = d["conversations"][-1]["value"]
            output_file = os.path.join(output_folder, image_id.replace("/", "_").replace(".jpg", ".txt"))
        else:
            image_id = d["example_id"]
            caption = d["description"]
            output_file = os.path.join(output_folder, image_id + ".txt")

        # Decide whether to skip before building the request.
        if os.path.exists(output_file) and not overwrite:
            continue

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "Caption: " + caption + "\n"},
        ]

        # Every attempt counts against max_num_retries, so a persistent
        # failure cannot loop forever.
        for _attempt in range(max_num_retries):
            try:
                response = openai.ChatCompletion.create(
                    engine=deployment_id,
                    messages=messages,
                    temperature=1,
                    max_tokens=1024,
                    # top_p=0.95,
                    # frequency_penalty=0,
                    # presence_penalty=0,
                    # stop=None
                    )
                content = response['choices'][0]['message']['content']
            except Exception as e:
                # Deliberately broad: this is the retry boundary around a
                # remote call, and any failure is either skipped or retried.
                if "content management policy" in f"{e}":
                    # Retrying the same caption would be rejected again.
                    print("Skipping due to content management policy")
                    break
                print(f"Failed to call GPT-4 ({e}), sleep 2s")
                time.sleep(2)
                continue
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(content)
            break
        else:
            # Reached only when no attempt succeeded and none was skipped.
            print(f"Giving up on {image_id} after {max_num_retries} failed attempts")

if __name__ == '__main__':
  # Script entry point: hand gen_na_w_caption to python-fire, which builds the
  # command-line interface from the function it is given.
  import fire as _fire_cli
  _fire_cli.Fire(gen_na_w_caption)